Import Packages¶

In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

pd.set_option('display.max_columns', None)
%matplotlib inline
C:\Users\micha\anaconda3\lib\site-packages\numpy\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:
C:\Users\micha\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\micha\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll
  warnings.warn("loaded more than 1 DLL from .libs:"

Import fungsi repository yang diperlukan

In [2]:
def timer(start_time, header='\nProgram Schedule'):
    end_time = pd.to_datetime('now', utc=True) + pd.Timedelta('07:00:00')
    print('\n\033[1m' + header + ':\033[0m')
    print(start_time.strftime('Start   : %Y-%m-%d %I:%M:%S %p'))
    print(end_time.strftime('Finish  : %Y-%m-%d %I:%M:%S %p'))
    run_time = str(end_time - start_time)
    idx1 = run_time.find(' ', 3) + 1
    idx2 = run_time.find('.')
    print('Runtime :', run_time[idx1 : idx2], '\n')
    
def category_counts(df, col, header=None, decimal=1, asc=False, top=10, style=True):
    header = col if header is None else header
    table = df[col].value_counts().rename_axis(header).reset_index(name='Count')
    table['Percentage'] = table['Count'] / table['Count'].sum(axis=0)
    
    if asc:
        table.sort_values(by=header, inplace=True)
        table.reset_index(drop=True, inplace=True)
        
    table['Cumulative'] = table['Percentage'].cumsum()
    table.index += 1
    
    table = table.head(top)
    
    if style:
        pct = '{:.' + str(decimal) + '%}'
        table = table.style.format({'Count': '{:,}', 'Percentage': pct, 'Cumulative': pct})
        
    return table

def memory_usage(df, unit='MiB'):
    size = df.memory_usage(index=True, deep=True).sum()
    
    if unit == 'GiB':
        print('Memory usage:', f'{size / 2**30:,.2f}', unit)
    elif unit == 'MiB':
        print('Memory usage:', f'{size / 2**20:,.2f}', unit)
    else:
        print('Memory usage:', f'{size / 2**10:,.2f}', 'kiB')

from scipy.stats import chi2_contingency

def target_pivot(df, var, target, count, percent=True, ncat=10, alpha=0.05, style=True):
    n = len(df[var].unique())
    
    if n <= ncat:
        pvt = pd.pivot_table(df, values=count, index=var, columns=target, aggfunc='count')
        pval = chi2_contingency(pvt)[1]
        
        print('P-value:', f'{pval:.2%}')
        
        if pval <= alpha:
            print('\033[1mConclusion:\033[0m There is a statistically significant association between\033[1m', var, 
                  '\033[0mand\033[1m', target, '\033[0m\n')
            
            print('The differences between the observed counts and the expected counts:')
            display((pvt - chi2_contingency(pvt)[3]).round(1))
            
        else:
            print('\033[1mConclusion:\033[0m There is NOT enough evidence to conclude that\033[1m', var, 
                  '\033[0mand\033[1m', target, '\033[0mare associated\n')
        
        if percent:
            cols = pvt.columns
            total = pvt.sum(1)

            for col in cols:
                pvt[col] = pvt[col] / total
                
                if col in [1, True, 'Y', 'Yes', 'YES']:
                    pos = col
                
            pvt.sort_values(by=pos, ascending=False, inplace=True) 
            
        if style:
            fmt = '{:,.2%}' if percent else '{:,}'
            pvt = pvt.style.format(fmt)
            
        print('Pivot table between variable\033[1m', var, '\033[0mand\033[1m', target, '\033[0m(in percent):')
            
        display(pvt)
    
    else:
        print('Pivot table for a high cardinality variable will not be shown.', 
              'Variable\033[1m', var, '\033[0mhas', f'{n:,}', 'unique values.')

import matplotlib

#matplotlib.use('TkAgg')
    
def boxplot(data, variable, category, threshold=0.8, plotsize=(7,7), colors=['green', 'red'], limits=None):
    if len(data[category].unique()) == 2:
        colors = colors
        alpha = 0.5
        
        x, y = data[[variable, category]].T.values
    else:
        colors = None
        alpha = 1
    
    fig, ax = plt.subplots(figsize=plotsize)
    
    colors = colors if len(data[category].unique()) == 2 else None
    alpha = 0.5 if len(data[category].unique()) == 2 else 1
    
    sns.boxplot(x=category, y=variable, data=data, ax=ax, 
                showmeans=True, meanline=True, palette=colors, width=0.3,
                flierprops=dict(marker='o', markerfacecolor='red', markeredgecolor='white', alpha=alpha),
                meanprops=dict(linestyle='-', linewidth=2, color='orange'),
                medianprops=dict(linestyle='-', linewidth=2, color='white'), 
                boxprops=dict(linewidth=0, alpha=0.7))
    
    ax.set(ylim=limits)
    
    plt.yticks(rotation=0)
    
    plt.show()
    
import phik
import ppscore as pps

def predictive_power(data, variables, target=None, plotsize=(20,20)):
    cols = variables + [target] if target != None else variables
    
    corrs = pps.matrix(data[cols]).pivot(columns='x', index='y', values='ppscore')
    print('\033[1mPredictive Power Score Matrix:\033[0m')
        
    fig, ax = plt.subplots(figsize=plotsize)
    ax = sns.heatmap(corrs, annot=True, fmt='.1%', cmap='RdYlGn', cbar_kws={'shrink': 0.8}, 
                     vmin=0, center=0.5, vmax=1, square=True, linewidths=0.1, linecolor='white')
    plt.show()
    
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif(data, variables, style=True):
    # Add constant to the GLM variables
    X = add_constant(data[variables])
    vif_values = [variance_inflation_factor(X.values, i) for i in range(1, X.shape[1])]
    vif_df = pd.DataFrame({'Variable': variables, 'VIF': vif_values})
    vif_df = vif_df.sort_values('VIF', ascending=False).reset_index(drop=True)
    vif_df.index += 1

    if style:
        vif_df = vif_df.style.format({'VIF': '{:,.2f}'})
        
    return vif_df

def table(data, values, index, columns=None, top=10, icol=0, asc=True, style=True, idxwidth='125px', colwidth='50px'):
    print('\n\033[1mInfo in form of', values, 'level:\033[0m')
    
    if columns is None:
        tab1 = data.groupby(index, as_index=False).agg(Count=(values, 'nunique'))
        tab1['Percentage'] = tab1['Count'] / tab1['Count'].sum(axis=0)

        if asc:
            tab1.sort_values(by='Count', ascending=False, inplace=True, ignore_index=True)

        tab1['Cumulative'] = tab1['Percentage'].cumsum()
        tab1.index += 1

        tab1 = tab1.loc[tab1['Count'] > 0].head(top)

        if style:
            pct = '{:.1%}'
            tab1 = tab1.style.format({'Count': '{:,}', 'Percentage': pct, 'Cumulative': pct})

        return tab1
    
    else:
        tab1 = pd.pivot_table(data=data, values=values, index=index, columns=columns, aggfunc='nunique', fill_value=0)
        
        if asc:
            tab1.sort_values(by=tab1.columns[icol], ascending=False, inplace=True)

        tab2 = tab1 / tab1.sum()
        tab3 = tab1.div(tab1.sum(axis=1), axis=0)

        if style:
            index_styles = [{'selector': 'th.row_heading', 'props': 'width: ' + idxwidth + ';'},
                            {'selector': 'th.col_heading', 'props': 'width: ' + colwidth + ';'}]

            display(tab1.head(top).style.format('{:,.0f}').set_table_styles(index_styles))
            display(tab2.head(top).style.format('{:,.1%}').set_table_styles(index_styles))
            display(tab3.head(top).style.format('{:,.1%}').set_table_styles(index_styles))
        else:
            return tab1, tab2, tab3

Import Datasets¶

In [3]:
agent_data = pd.read_parquet('agent.parquet')
beneficiary_data = pd.read_parquet('beneficiary.parquet')
client_data = pd.read_parquet('client.parquet')
policy_data = pd.read_parquet('policy.parquet')

Data Cleaning and Preparation¶

Membuat 2 kolom baru pada data beneficiary.

1. Kolom BENEFICIARY_TYPE_PRIMARY yang memiliki value 1 apabila BENEFICIARY_TYPE merupakan Primary dan 0 apabila tidak
2. Kolom BENEFICIARY_TYPE_SECONDARY yang memiliki value 1 apabila BENEFICIARY_TYPE merupakan SECONDARY dan 0 apabila tidak

Kolom-kolom baru tersebut akan dipakai dan digabungkan dengan data yang akan dipakai dalam pemodelan. Untuk sementara, kolom-kolom tersebut dipisah dan dibuatkan kedalam dataframe baru yaitu 'benedata'

In [4]:
beneficiary_data['BENEFICIARY_TYPE_PRIMARY'] = np.where(beneficiary_data['BENEFICIARY_TYPE'] == 'Primary', 1, 0)
beneficiary_data['BENEFICIARY_TYPE_SECONDARY'] = np.where(beneficiary_data['BENEFICIARY_TYPE'] == 'Secondary', 1, 0)
benedata = beneficiary_data[['CLIENT_ID', 'BENEFICIARY_TYPE_PRIMARY', 'BENEFICIARY_TYPE_SECONDARY']]
benedata = benedata.groupby('CLIENT_ID').sum().reset_index(drop=False)

Mengubah data-data waktu menjadi datetime

In [5]:
agent_data['DOB'] = pd.to_datetime(agent_data['DOB'])
agent_data['CONTRACT_DATE'] = pd.to_datetime(agent_data['CONTRACT_DATE'])
agent_data['TERMINATION_DATE'] = pd.to_datetime(agent_data['TERMINATION_DATE'])

client_data['DOB'] = pd.to_datetime(client_data['DOB'])

policy_data['APPLICATION_DATE'] = pd.to_datetime(policy_data['APPLICATION_DATE'])
policy_data['ISSUE_DATE'] = pd.to_datetime(policy_data['ISSUE_DATE'])

Melihat apakah ada data kosong pada policy_data

In [6]:
policy_data.isnull().sum()
Out[6]:
POLICY_ID           0
CLIENT_ID           0
APPLICATION_DATE    0
ISSUE_DATE          0
POLICY_STATUS       0
PRODUCT_CODE        0
PRODUCT_TYPE        0
PAYMENT_TYPE        0
PAYMENT_TERM        0
PREMIUM             0
RIDER_COUNT         0
AGENT_ID            0
dtype: int64

Menampilkan tabel untuk melihat tanggal aplikasi terawal dan terakhir pada masing-masing produk yang terjual pada policy_data

In [7]:
policy_data.groupby('PRODUCT_CODE').agg(FIRST_APPLICATION_DATE=('APPLICATION_DATE','min'),
                                LAST_APPLICATION_DATE=('APPLICATION_DATE','max'))
Out[7]:
FIRST_APPLICATION_DATE LAST_APPLICATION_DATE
PRODUCT_CODE
P01 2014-12-02 2019-12-23
P02 2014-12-02 2019-12-19
P03 2014-12-04 2019-12-15
P04 2014-12-06 2019-12-21
P05 2014-12-06 2019-12-22
P06 2014-12-07 2019-12-21
P07 2014-12-07 2019-12-21
P08 2014-12-08 2019-12-23
P09 2014-12-08 2019-12-24
P10 2014-12-08 2019-12-17
P11 2014-12-09 2019-12-23
P12 2014-12-10 2019-12-19
P13 2014-12-10 2019-12-21
P14 2014-12-10 2019-12-24
P15 2014-12-13 2019-12-19
P16 2014-12-14 2019-12-23
P17 2014-12-14 2019-12-12
P18 2014-12-15 2019-12-22
P19 2014-12-16 2019-12-15
P20 2014-12-19 2019-12-26
P21 2015-01-13 2019-12-15
P22 2015-01-22 2019-12-15
P23 2015-08-18 2019-12-14
P24 2015-08-20 2019-12-24
P25 2015-08-25 2019-12-18
P26 2016-03-28 2019-11-29
P27 2019-02-11 2019-12-30
P28 2019-02-12 2019-12-27
P29 2019-02-12 2019-12-26
P30 2019-02-12 2019-12-30
P31 2019-02-13 2019-12-29
P32 2019-02-15 2019-12-27
P33 2019-02-17 2019-12-20

Membuat filter untuk produk yang terjual hanya hingga awal tahun 2015 lalu membuatnya menjadi tipe data kategorikal

In [8]:
#SET PRODUCT CODE DARI 1 - 22
products = ['P' + str(i).zfill(2) for i in range (1,23)]
policy_data.query("PRODUCT_CODE in @products", inplace=True)
policy_data.reset_index(drop=True, inplace=True)
policy_data['PRODUCT_CODE'] = policy_data['PRODUCT_CODE'].astype(str).astype('category')
policy_data.head()
Out[8]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE POLICY_STATUS PRODUCT_CODE PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM PREMIUM RIDER_COUNT AGENT_ID
0 02015000001 12015010001 2014-12-02 2015-01-01 Inforce P01 Unit-Linked Single Annually 4061000.0 5 3201207023
1 02015000002 12015010002 2014-12-02 2015-01-01 Inforce P02 Unit-Linked Regular Quarterly 31300.0 4 3201302026
2 02015000003 12015010003 2014-12-04 2015-01-03 Inforce P03 Unit-Linked Single Annually 1400000.0 2 3201310043
3 02015000004 12015010004 2014-12-06 2015-01-02 Inforce P04 Unit-Linked Regular Monthly 20800.0 4 3201403029
4 02015000005 12015010005 2014-12-06 2015-01-03 Inforce P02 Unit-Linked Regular Semi-Annually 160000.0 3 3201306021

Membuat 2 kolom baru pada policy_data.

1. POLICY_OWNERSHIP yang menunjukkan jumlah polis yang dimiliki oleh client dengan client ID tertentu
2. PURCHASE_ORDER yang menunjukkan urutan pembelian polis oleh client
In [9]:
policy_data.sort_values(by = ['CLIENT_ID', 'APPLICATION_DATE', 'ISSUE_DATE'],ignore_index = True, inplace = True)
policy_data = policy_data.assign(POLICY_OWNERSHIP = policy_data.groupby('CLIENT_ID')['POLICY_ID'].transform('count').astype('int16'))
policy_data = policy_data.assign(PURCHASE_ORDER = policy_data.groupby(['CLIENT_ID'])['ISSUE_DATE'].rank(method = 'first').astype('int16'))
policy_data.tail()
Out[9]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE POLICY_STATUS PRODUCT_CODE PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM PREMIUM RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER
41861 02019015369 12019121069 2019-12-24 2019-12-31 Inforce P14 Traditional Single Annually 3240000.0 1 3201210011 1 1
41862 02019015377 12019121077 2019-12-25 2019-12-26 Inforce P20 Traditional Regular Quarterly 13800.0 2 3201912113 1 1
41863 02019015378 12019121078 2019-12-25 2019-12-26 Inforce P20 Traditional Single Annually 10500100.0 4 3201912188 1 1
41864 02019015395 12019121095 2019-12-26 2019-12-27 Inforce P20 Traditional Regular Annually 344300.0 5 3201912122 1 1
41865 02019015397 12019121097 2019-12-26 2019-12-27 Inforce P20 Traditional Regular Monthly 57500.0 2 3201912214 1 1

Membuat 2 dataframe baru dan digabungkan menjadi dataframe bernama "df"

1. data1 yang berisi client-client yang memiliki 2 polis dimana urutan pembelian polisnya yang pertama
2. data2 yang berisi client-client yang memiliki 2 polis dimana urutan pembelian polisnya yang kedua
In [10]:
data1 = policy_data.query("POLICY_OWNERSHIP == 2 and PURCHASE_ORDER == 1")
data2 = policy_data.query("POLICY_OWNERSHIP == 2 and PURCHASE_ORDER == 2")

data3 = policy_data.copy()
In [11]:
cols = ['CLIENT_ID','ISSUE_DATE','PRODUCT_CODE','PREMIUM', 'PAYMENT_TERM']
df = pd.merge(data1, data2[cols], on='CLIENT_ID', suffixes=('_1','_2'))
df.reset_index(drop=True, inplace=True)
del data1,data2
df.tail()
Out[11]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE_1 POLICY_STATUS PRODUCT_CODE_1 PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM_1 PREMIUM_1 RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER ISSUE_DATE_2 PRODUCT_CODE_2 PREMIUM_2 PAYMENT_TERM_2
5834 02019013120 12019110389 2019-10-31 2019-11-27 Inforce P09 Unit-Linked Regular Quarterly 12500.0 3 3201307071 2 1 2019-12-12 P09 21300.0 Quarterly
5835 02019013319 12019110540 2019-11-04 2019-11-26 Inforce P08 Unit-Linked Regular Quarterly 12800.0 2 3201412069 2 1 2019-12-23 P08 11300.0 Quarterly
5836 02019013396 12019110599 2019-11-06 2019-11-18 Inforce P09 Unit-Linked Single Annually 986100.0 3 3201209050 2 1 2019-12-08 P09 49300.0 Semi-Annually
5837 02019013633 12019110762 2019-11-11 2019-11-17 Inforce P17 Traditional Regular Monthly 1300.0 4 3201202042 2 1 2019-12-27 P05 11000.0 Semi-Annually
5838 02019013558 12019120026 2019-11-09 2019-12-08 Inforce P19 Unit-Linked Regular Semi-Annually 124800.0 3 3201405074 2 1 2019-12-21 P19 104000.0 Quarterly

df lalu difilter sehingga hanya berisikan produk-produk dengan kode P03, P07, dan P08. Lalu, df akan difilter kembali dengan syarat produk pembelian pertama tidak sama dengan produk pembelian kedua.

In [12]:
df = df.query("PRODUCT_CODE_2 == 'P03' or PRODUCT_CODE_2 == 'P07' or PRODUCT_CODE_2 == 'P08'")
In [13]:
df = df[df['PRODUCT_CODE_1'] != df['PRODUCT_CODE_2']]
df
Out[13]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE_1 POLICY_STATUS PRODUCT_CODE_1 PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM_1 PREMIUM_1 RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER ISSUE_DATE_2 PRODUCT_CODE_2 PREMIUM_2 PAYMENT_TERM_2
6 02015000024 12015010024 2014-12-10 2015-01-03 Inforce P12 Traditional Single Annually 1601500.0 3 3201202039 2 1 2016-11-16 P08 37500.0 Quarterly
16 02015000062 12015010062 2014-12-16 2015-01-02 Inforce P09 Unit-Linked Regular Semi-Annually 16200.0 2 3201303041 2 1 2019-02-25 P07 7200.0 Semi-Annually
23 02015000087 12015010087 2014-12-18 2015-01-09 Inforce P12 Traditional Regular Annually 180000.0 2 3201405003 2 1 2015-11-06 P08 32500.0 Quarterly
24 02015000088 12015010088 2014-12-18 2015-01-12 Inforce P12 Traditional Single Annually 7500000.0 2 3201401012 2 1 2017-12-07 P08 175000.0 Semi-Annually
28 02015000096 12015010096 2014-12-19 2015-01-07 Inforce P09 Unit-Linked Regular Quarterly 25400.0 4 3201210024 2 1 2017-01-27 P07 14100.0 Monthly
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5811 02019009988 12019090169 2019-08-24 2019-09-01 Inforce P17 Traditional Regular Annually 39000.0 4 3201301010 2 1 2019-12-02 P03 360000.0 Annually
5812 02019010240 12019090303 2019-08-29 2019-09-16 Inforce P09 Unit-Linked Regular Monthly 3300.0 1 3201311009 2 1 2019-11-25 P07 4000.0 Quarterly
5818 02019011032 12019090845 2019-09-16 2019-09-26 Inforce P17 Traditional Regular Quarterly 17500.0 4 3201208034 2 1 2019-12-20 P03 11700.0 Monthly
5826 02019011802 12019100503 2019-10-03 2019-10-16 Inforce P09 Unit-Linked Regular Annually 30000.0 3 3201206030 2 1 2019-11-29 P07 25500.0 Quarterly
5828 02019012018 12019100658 2019-10-08 2019-10-16 Inforce P12 Traditional Regular Quarterly 38300.0 4 3201405062 2 1 2019-11-14 P08 25500.0 Quarterly

750 rows × 18 columns

Membuat kolom baru pada df yang bernama PURCHASE_PERIOD yang merupakan periode antar pembelian produk pada client dalam hitungan bulan.

In [14]:
df['PURCHASE_PERIOD'] = ((df['ISSUE_DATE_2'] - df['ISSUE_DATE_1']) / np.timedelta64(1, 'M')).round(1)
df
Out[14]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE_1 POLICY_STATUS PRODUCT_CODE_1 PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM_1 PREMIUM_1 RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER ISSUE_DATE_2 PRODUCT_CODE_2 PREMIUM_2 PAYMENT_TERM_2 PURCHASE_PERIOD
6 02015000024 12015010024 2014-12-10 2015-01-03 Inforce P12 Traditional Single Annually 1601500.0 3 3201202039 2 1 2016-11-16 P08 37500.0 Quarterly 22.4
16 02015000062 12015010062 2014-12-16 2015-01-02 Inforce P09 Unit-Linked Regular Semi-Annually 16200.0 2 3201303041 2 1 2019-02-25 P07 7200.0 Semi-Annually 49.8
23 02015000087 12015010087 2014-12-18 2015-01-09 Inforce P12 Traditional Regular Annually 180000.0 2 3201405003 2 1 2015-11-06 P08 32500.0 Quarterly 9.9
24 02015000088 12015010088 2014-12-18 2015-01-12 Inforce P12 Traditional Single Annually 7500000.0 2 3201401012 2 1 2017-12-07 P08 175000.0 Semi-Annually 34.8
28 02015000096 12015010096 2014-12-19 2015-01-07 Inforce P09 Unit-Linked Regular Quarterly 25400.0 4 3201210024 2 1 2017-01-27 P07 14100.0 Monthly 24.7
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5811 02019009988 12019090169 2019-08-24 2019-09-01 Inforce P17 Traditional Regular Annually 39000.0 4 3201301010 2 1 2019-12-02 P03 360000.0 Annually 3.0
5812 02019010240 12019090303 2019-08-29 2019-09-16 Inforce P09 Unit-Linked Regular Monthly 3300.0 1 3201311009 2 1 2019-11-25 P07 4000.0 Quarterly 2.3
5818 02019011032 12019090845 2019-09-16 2019-09-26 Inforce P17 Traditional Regular Quarterly 17500.0 4 3201208034 2 1 2019-12-20 P03 11700.0 Monthly 2.8
5826 02019011802 12019100503 2019-10-03 2019-10-16 Inforce P09 Unit-Linked Regular Annually 30000.0 3 3201206030 2 1 2019-11-29 P07 25500.0 Quarterly 1.4
5828 02019012018 12019100658 2019-10-08 2019-10-16 Inforce P12 Traditional Regular Quarterly 38300.0 4 3201405062 2 1 2019-11-14 P08 25500.0 Quarterly 1.0

750 rows × 19 columns

In [15]:
fig, ax = plt.subplots(figsize=(7,7))
avg = df['PURCHASE_PERIOD'].mean()

ax = sns.histplot(data=df, x='PURCHASE_PERIOD', stat='percent', color='yellow')
ax.axvline(x=avg, color='red', ls="--", lw=2)
plt.show();
No description has been provided for this image

Didapatkan bahwa golden period untuk penjualan polis asuransi adalah 6 bulan. Oleh karena itu, dibuat kolom baru yang merupakan target untuk pemodelan yaitu 'Yes' apabila periode pembelian antar premi lebih kecil sama dengan 6 bulan, dan 'No' apabila tidak.

In [16]:
df['TARGET'] = np.where(df['PURCHASE_PERIOD'] > 6, 'No', 'Yes') # waktu untuk beli > 6, probabilitas untuk beli kedepannya semakin kecil
df.groupby(['PRODUCT_CODE_2', 'TARGET'])['PURCHASE_PERIOD'].describe().query('count > 0')
Out[16]:
count mean std min 25% 50% 75% max
PRODUCT_CODE_2 TARGET
P03 No 112.0 23.096429 12.390769 6.1 12.975 20.70 32.050 55.7
Yes 35.0 3.100000 1.905102 0.3 1.900 2.80 4.900 6.0
P07 No 322.0 23.420497 13.348549 6.1 12.150 20.25 33.075 55.8
Yes 71.0 3.007042 1.820073 0.1 1.400 3.20 4.450 5.9
P08 No 168.0 23.045238 12.833376 6.1 12.500 19.05 33.125 57.2
Yes 42.0 2.759524 1.648807 0.0 1.425 2.90 3.900 6.0

Dataframe 'benedata' lalu digabungkan dengan 'df' dengan syarat CLIENT_ID yang sama. Dataframe baru ini dinamakan 'data'

In [17]:
data = df.merge(client_data, on = 'CLIENT_ID').merge(benedata, on = 'CLIENT_ID').merge(agent_data, on = 'AGENT_ID',suffixes=('','_Agent'))
data
Out[17]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE_1 POLICY_STATUS PRODUCT_CODE_1 PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM_1 PREMIUM_1 RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER ISSUE_DATE_2 PRODUCT_CODE_2 PREMIUM_2 PAYMENT_TERM_2 PURCHASE_PERIOD TARGET GENDER DOB MARITAL_STATUS LOCATION NATIONALITY OCCUPATION ANNUAL_INCOME SMOKER PAYMENT_METHOD CHANNEL TOTAL_COVERAGE BENEFICIARY_TYPE_PRIMARY BENEFICIARY_TYPE_SECONDARY DOB_Agent MARITAL_STATUS_Agent TEAM_CODE CONTRACT_DATE AGENT_STATUS TERMINATION_DATE FIRST_YEAR_COMMISSION
0 02015000024 12015010024 2014-12-10 2015-01-03 Inforce P12 Traditional Single Annually 1601500.0 3 3201202039 2 1 2016-11-16 P08 37500.0 Quarterly 22.4 No Male 2004-02-12 Single BATANGAS PH GOVT-CAR 1000900.0 No Direct Other 2700000.0 2 0 1989-12-27 Married A0165 2012-02-16 Active NaT 853270.0
1 02015000062 12015010062 2014-12-16 2015-01-02 Inforce P09 Unit-Linked Regular Semi-Annually 16200.0 2 3201303041 2 1 2019-02-25 P07 7200.0 Semi-Annually 49.8 No Male 1985-01-06 Divorced QUEZON PH MISSIONARY 180000.0 No Direct Agent Partner 1050000.0 1 1 1988-10-20 Married A0136 2013-03-01 Active NaT 1172380.0
2 02015000087 12015010087 2014-12-18 2015-01-09 Inforce P12 Traditional Regular Annually 180000.0 2 3201405003 2 1 2015-11-06 P08 32500.0 Quarterly 9.9 No Male 2002-10-19 Married RIZAL PH BUSINESS 1000000.0 No Direct Agent Staff 5890900.0 1 1 1986-12-08 Married A0002 2014-05-24 Inactive 2023-01-05 840320.0
3 02015000088 12015010088 2014-12-18 2015-01-12 Inforce P12 Traditional Single Annually 7500000.0 2 3201401012 2 1 2017-12-07 P08 175000.0 Semi-Annually 34.8 No Male 1982-08-24 Divorced MANILA PH MANAG 5000000.0 No Direct Agent Staff 45342100.0 1 1 1985-08-30 Married A0080 2014-01-16 Active NaT 2690750.0
4 02018002785 12018040234 2018-03-28 2018-04-26 Inforce P09 Unit-Linked Regular Semi-Annually 40000.0 2 3201401012 2 1 2018-09-08 P07 27500.0 Quarterly 4.4 Yes Female 1996-08-30 Married ORIENTAL MINDORO PH MANAG 1000000.0 No Direct Agent Partner 5200000.0 1 1 1985-08-30 Married A0080 2014-01-16 Active NaT 2690750.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
745 02019009354 12019080681 2019-08-09 2019-08-28 Inforce P17 Traditional Regular Semi-Annually 16200.0 4 3201309011 2 1 2019-10-30 P03 216000.0 Annually 2.1 Yes Female 1999-05-28 Single ISABELA PH VENDOR 180000.0 No Single Premium Agent Partner 1000000.0 1 1 1987-08-02 Married A0068 2013-09-28 Inactive 2023-01-05 1086930.0
746 02019010240 12019090303 2019-08-29 2019-09-16 Inforce P09 Unit-Linked Regular Monthly 3300.0 1 3201311009 2 1 2019-11-25 P07 4000.0 Quarterly 2.3 Yes Female 2003-06-28 Separated MANILA PH MATERL PLN 200000.0 No Direct Agent Partner 3000000.0 1 1 1985-09-10 Married A0075 2013-11-18 Inactive 2023-01-05 844060.0
747 02019011032 12019090845 2019-09-16 2019-09-26 Inforce P17 Traditional Regular Quarterly 17500.0 4 3201208034 2 1 2019-12-20 P03 11700.0 Monthly 2.8 Yes Male 1990-04-15 Single CAMARINES SUR PH SALES MNGR 1000000.0 No Direct Broker 3178000.0 1 1 1984-03-12 Married A0113 2012-08-22 Active NaT 1501010.0
748 02019011802 12019100503 2019-10-03 2019-10-16 Inforce P09 Unit-Linked Regular Annually 30000.0 3 3201206030 2 1 2019-11-29 P07 25500.0 Quarterly 1.4 Yes Male 1992-07-05 Single METRO MANILA PH SEAMENOF 600000.0 No Single Premium Broker 1500000.0 1 1 1985-01-23 Married A0128 2012-06-15 Active NaT 961350.0
749 02019012018 12019100658 2019-10-08 2019-10-16 Inforce P12 Traditional Regular Quarterly 38300.0 4 3201405062 2 1 2019-11-14 P08 25500.0 Quarterly 1.0 Yes Male 1998-05-13 Married MANILA PH DOCTOR 1700000.0 No Single Premium Agent Partner 3006200.0 2 0 1988-01-10 Married A0115 2014-05-26 Active NaT 301110.0

750 rows × 40 columns

In [18]:
data.isnull().sum()
Out[18]:
POLICY_ID                       0
CLIENT_ID                       0
APPLICATION_DATE                0
ISSUE_DATE_1                    0
POLICY_STATUS                   0
PRODUCT_CODE_1                  0
PRODUCT_TYPE                    0
PAYMENT_TYPE                    0
PAYMENT_TERM_1                  0
PREMIUM_1                       0
RIDER_COUNT                     0
AGENT_ID                        0
POLICY_OWNERSHIP                0
PURCHASE_ORDER                  0
ISSUE_DATE_2                    0
PRODUCT_CODE_2                  0
PREMIUM_2                       0
PAYMENT_TERM_2                  0
PURCHASE_PERIOD                 0
TARGET                          0
GENDER                          0
DOB                             0
MARITAL_STATUS                  0
LOCATION                        0
NATIONALITY                     0
OCCUPATION                      0
ANNUAL_INCOME                   0
SMOKER                          0
PAYMENT_METHOD                  0
CHANNEL                         0
TOTAL_COVERAGE                  2
BENEFICIARY_TYPE_PRIMARY        0
BENEFICIARY_TYPE_SECONDARY      0
DOB_Agent                       0
MARITAL_STATUS_Agent            0
TEAM_CODE                       0
CONTRACT_DATE                   0
AGENT_STATUS                    0
TERMINATION_DATE              569
FIRST_YEAR_COMMISSION           0
dtype: int64

Karena kolom 'TERMINATION_DATE' memiliki banyak value yang kosong, kolom tersebut dihapus.

In [19]:
data.drop(columns=['TERMINATION_DATE'], inplace=True)
data
Out[19]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE_1 POLICY_STATUS PRODUCT_CODE_1 PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM_1 PREMIUM_1 RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER ISSUE_DATE_2 PRODUCT_CODE_2 PREMIUM_2 PAYMENT_TERM_2 PURCHASE_PERIOD TARGET GENDER DOB MARITAL_STATUS LOCATION NATIONALITY OCCUPATION ANNUAL_INCOME SMOKER PAYMENT_METHOD CHANNEL TOTAL_COVERAGE BENEFICIARY_TYPE_PRIMARY BENEFICIARY_TYPE_SECONDARY DOB_Agent MARITAL_STATUS_Agent TEAM_CODE CONTRACT_DATE AGENT_STATUS FIRST_YEAR_COMMISSION
0 02015000024 12015010024 2014-12-10 2015-01-03 Inforce P12 Traditional Single Annually 1601500.0 3 3201202039 2 1 2016-11-16 P08 37500.0 Quarterly 22.4 No Male 2004-02-12 Single BATANGAS PH GOVT-CAR 1000900.0 No Direct Other 2700000.0 2 0 1989-12-27 Married A0165 2012-02-16 Active 853270.0
1 02015000062 12015010062 2014-12-16 2015-01-02 Inforce P09 Unit-Linked Regular Semi-Annually 16200.0 2 3201303041 2 1 2019-02-25 P07 7200.0 Semi-Annually 49.8 No Male 1985-01-06 Divorced QUEZON PH MISSIONARY 180000.0 No Direct Agent Partner 1050000.0 1 1 1988-10-20 Married A0136 2013-03-01 Active 1172380.0
2 02015000087 12015010087 2014-12-18 2015-01-09 Inforce P12 Traditional Regular Annually 180000.0 2 3201405003 2 1 2015-11-06 P08 32500.0 Quarterly 9.9 No Male 2002-10-19 Married RIZAL PH BUSINESS 1000000.0 No Direct Agent Staff 5890900.0 1 1 1986-12-08 Married A0002 2014-05-24 Inactive 840320.0
3 02015000088 12015010088 2014-12-18 2015-01-12 Inforce P12 Traditional Single Annually 7500000.0 2 3201401012 2 1 2017-12-07 P08 175000.0 Semi-Annually 34.8 No Male 1982-08-24 Divorced MANILA PH MANAG 5000000.0 No Direct Agent Staff 45342100.0 1 1 1985-08-30 Married A0080 2014-01-16 Active 2690750.0
4 02018002785 12018040234 2018-03-28 2018-04-26 Inforce P09 Unit-Linked Regular Semi-Annually 40000.0 2 3201401012 2 1 2018-09-08 P07 27500.0 Quarterly 4.4 Yes Female 1996-08-30 Married ORIENTAL MINDORO PH MANAG 1000000.0 No Direct Agent Partner 5200000.0 1 1 1985-08-30 Married A0080 2014-01-16 Active 2690750.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
745 02019009354 12019080681 2019-08-09 2019-08-28 Inforce P17 Traditional Regular Semi-Annually 16200.0 4 3201309011 2 1 2019-10-30 P03 216000.0 Annually 2.1 Yes Female 1999-05-28 Single ISABELA PH VENDOR 180000.0 No Single Premium Agent Partner 1000000.0 1 1 1987-08-02 Married A0068 2013-09-28 Inactive 1086930.0
746 02019010240 12019090303 2019-08-29 2019-09-16 Inforce P09 Unit-Linked Regular Monthly 3300.0 1 3201311009 2 1 2019-11-25 P07 4000.0 Quarterly 2.3 Yes Female 2003-06-28 Separated MANILA PH MATERL PLN 200000.0 No Direct Agent Partner 3000000.0 1 1 1985-09-10 Married A0075 2013-11-18 Inactive 844060.0
747 02019011032 12019090845 2019-09-16 2019-09-26 Inforce P17 Traditional Regular Quarterly 17500.0 4 3201208034 2 1 2019-12-20 P03 11700.0 Monthly 2.8 Yes Male 1990-04-15 Single CAMARINES SUR PH SALES MNGR 1000000.0 No Direct Broker 3178000.0 1 1 1984-03-12 Married A0113 2012-08-22 Active 1501010.0
748 02019011802 12019100503 2019-10-03 2019-10-16 Inforce P09 Unit-Linked Regular Annually 30000.0 3 3201206030 2 1 2019-11-29 P07 25500.0 Quarterly 1.4 Yes Male 1992-07-05 Single METRO MANILA PH SEAMENOF 600000.0 No Single Premium Broker 1500000.0 1 1 1985-01-23 Married A0128 2012-06-15 Active 961350.0
749 02019012018 12019100658 2019-10-08 2019-10-16 Inforce P12 Traditional Regular Quarterly 38300.0 4 3201405062 2 1 2019-11-14 P08 25500.0 Quarterly 1.0 Yes Male 1998-05-13 Married MANILA PH DOCTOR 1700000.0 No Single Premium Agent Partner 3006200.0 2 0 1988-01-10 Married A0115 2014-05-26 Active 301110.0

750 rows × 39 columns

Kolom-kolom yang akan dipakai untuk pemodelan akan diubah tipenya sesuai dengan data-datanya.

In [20]:
cols = {
    'POLICY_ID': str,
    'CLIENT_ID': str,
    'APPLICATION_DATE': 'datetime64[ns]',
    'ISSUE_DATE_1': 'datetime64[ns]',
    'POLICY_STATUS': 'category',
    'PRODUCT_CODE_1': 'category',
    'PRODUCT_TYPE': 'category',
    'PAYMENT_TYPE': 'category',
    'PAYMENT_TERM_1': 'category',
    'PAYMENT_TERM_2': 'category',
    'PREMIUM_1': float,
    'RIDER_COUNT': int,
    'AGENT_ID': 'category',
    'POLICY_OWNERSHIP': int,
    'PURCHASE_ORDER': int,
    'ISSUE_DATE_2': 'datetime64[ns]',
    'PRODUCT_CODE_2': str,
    'PREMIUM_2': float,
    'PURCHASE_PERIOD': float,
    'TARGET': 'category',
    'GENDER': 'category',
    'DOB': 'datetime64[ns]',
    'MARITAL_STATUS': 'category',
    'LOCATION': str,
    'NATIONALITY': 'category',
    'OCCUPATION': str,
    'ANNUAL_INCOME': float,
    'SMOKER': 'category',
    'PAYMENT_METHOD': 'category',
    'CHANNEL': 'category',
    'TOTAL_COVERAGE': float,
    'BENEFICIARY_TYPE_PRIMARY': int,
    'BENEFICIARY_TYPE_SECONDARY': int,
    'DOB_Agent': 'datetime64[ns]',
    'MARITAL_STATUS_Agent': 'category',
    'TEAM_CODE': str,
    'CONTRACT_DATE': 'datetime64[ns]',
    'AGENT_STATUS': 'category',
    'FIRST_YEAR_COMMISSION': float,
}

data = data.astype(cols)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 750 entries, 0 to 749
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   POLICY_ID                   750 non-null    object        
 1   CLIENT_ID                   750 non-null    object        
 2   APPLICATION_DATE            750 non-null    datetime64[ns]
 3   ISSUE_DATE_1                750 non-null    datetime64[ns]
 4   POLICY_STATUS               750 non-null    category      
 5   PRODUCT_CODE_1              750 non-null    category      
 6   PRODUCT_TYPE                750 non-null    category      
 7   PAYMENT_TYPE                750 non-null    category      
 8   PAYMENT_TERM_1              750 non-null    category      
 9   PREMIUM_1                   750 non-null    float64       
 10  RIDER_COUNT                 750 non-null    int32         
 11  AGENT_ID                    750 non-null    category      
 12  POLICY_OWNERSHIP            750 non-null    int32         
 13  PURCHASE_ORDER              750 non-null    int32         
 14  ISSUE_DATE_2                750 non-null    datetime64[ns]
 15  PRODUCT_CODE_2              750 non-null    object        
 16  PREMIUM_2                   750 non-null    float64       
 17  PAYMENT_TERM_2              750 non-null    category      
 18  PURCHASE_PERIOD             750 non-null    float64       
 19  TARGET                      750 non-null    category      
 20  GENDER                      750 non-null    category      
 21  DOB                         750 non-null    datetime64[ns]
 22  MARITAL_STATUS              750 non-null    category      
 23  LOCATION                    750 non-null    object        
 24  NATIONALITY                 750 non-null    category      
 25  OCCUPATION                  750 non-null    object        
 26  ANNUAL_INCOME               750 non-null    float64       
 27  SMOKER                      750 non-null    category      
 28  PAYMENT_METHOD              750 non-null    category      
 29  CHANNEL                     750 non-null    category      
 30  TOTAL_COVERAGE              748 non-null    float64       
 31  BENEFICIARY_TYPE_PRIMARY    750 non-null    int32         
 32  BENEFICIARY_TYPE_SECONDARY  750 non-null    int32         
 33  DOB_Agent                   750 non-null    datetime64[ns]
 34  MARITAL_STATUS_Agent        750 non-null    category      
 35  TEAM_CODE                   750 non-null    object        
 36  CONTRACT_DATE               750 non-null    datetime64[ns]
 37  AGENT_STATUS                750 non-null    category      
 38  FIRST_YEAR_COMMISSION       750 non-null    float64       
dtypes: category(16), datetime64[ns](6), float64(6), int32(5), object(6)
memory usage: 162.7+ KB

Features Engineering¶

Dibawah merupakan proses untuk mengubah periode pembayaran premi menjadi jumlah per bulan (untuk kedua produk). Selanjutnya merupakan proses pembuatan kolom baru yaitu jumlah premi tahunan per produk (untuk kedua produk).

In [21]:
frequency_mapping = {
    'Annually': 1,
    'Semi-Annually': 2,
    'Quarterly': 4,
    'Monthly': 12
}

data['TERM_MAPPED_1'] = data['PAYMENT_TERM_1'].map(frequency_mapping).astype(int)
data['TERM_MAPPED_2'] = data['PAYMENT_TERM_2'].map(frequency_mapping).astype(int)
data['ANNUALIZED_PREMIUM_1'] = data['TERM_MAPPED_1'] * data['PREMIUM_1']
data['ANNUALIZED_PREMIUM_2'] = data['TERM_MAPPED_2'] * data['PREMIUM_2']
data
Out[21]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE_1 POLICY_STATUS PRODUCT_CODE_1 PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM_1 PREMIUM_1 RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER ISSUE_DATE_2 PRODUCT_CODE_2 PREMIUM_2 PAYMENT_TERM_2 PURCHASE_PERIOD TARGET GENDER DOB MARITAL_STATUS LOCATION NATIONALITY OCCUPATION ANNUAL_INCOME SMOKER PAYMENT_METHOD CHANNEL TOTAL_COVERAGE BENEFICIARY_TYPE_PRIMARY BENEFICIARY_TYPE_SECONDARY DOB_Agent MARITAL_STATUS_Agent TEAM_CODE CONTRACT_DATE AGENT_STATUS FIRST_YEAR_COMMISSION TERM_MAPPED_1 TERM_MAPPED_2 ANNUALIZED_PREMIUM_1 ANNUALIZED_PREMIUM_2
0 02015000024 12015010024 2014-12-10 2015-01-03 Inforce P12 Traditional Single Annually 1601500.0 3 3201202039 2 1 2016-11-16 P08 37500.0 Quarterly 22.4 No Male 2004-02-12 Single BATANGAS PH GOVT-CAR 1000900.0 No Direct Other 2700000.0 2 0 1989-12-27 Married A0165 2012-02-16 Active 853270.0 1 4 1601500.0 150000.0
1 02015000062 12015010062 2014-12-16 2015-01-02 Inforce P09 Unit-Linked Regular Semi-Annually 16200.0 2 3201303041 2 1 2019-02-25 P07 7200.0 Semi-Annually 49.8 No Male 1985-01-06 Divorced QUEZON PH MISSIONARY 180000.0 No Direct Agent Partner 1050000.0 1 1 1988-10-20 Married A0136 2013-03-01 Active 1172380.0 2 2 32400.0 14400.0
2 02015000087 12015010087 2014-12-18 2015-01-09 Inforce P12 Traditional Regular Annually 180000.0 2 3201405003 2 1 2015-11-06 P08 32500.0 Quarterly 9.9 No Male 2002-10-19 Married RIZAL PH BUSINESS 1000000.0 No Direct Agent Staff 5890900.0 1 1 1986-12-08 Married A0002 2014-05-24 Inactive 840320.0 1 4 180000.0 130000.0
3 02015000088 12015010088 2014-12-18 2015-01-12 Inforce P12 Traditional Single Annually 7500000.0 2 3201401012 2 1 2017-12-07 P08 175000.0 Semi-Annually 34.8 No Male 1982-08-24 Divorced MANILA PH MANAG 5000000.0 No Direct Agent Staff 45342100.0 1 1 1985-08-30 Married A0080 2014-01-16 Active 2690750.0 1 2 7500000.0 350000.0
4 02018002785 12018040234 2018-03-28 2018-04-26 Inforce P09 Unit-Linked Regular Semi-Annually 40000.0 2 3201401012 2 1 2018-09-08 P07 27500.0 Quarterly 4.4 Yes Female 1996-08-30 Married ORIENTAL MINDORO PH MANAG 1000000.0 No Direct Agent Partner 5200000.0 1 1 1985-08-30 Married A0080 2014-01-16 Active 2690750.0 2 4 80000.0 110000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
745 02019009354 12019080681 2019-08-09 2019-08-28 Inforce P17 Traditional Regular Semi-Annually 16200.0 4 3201309011 2 1 2019-10-30 P03 216000.0 Annually 2.1 Yes Female 1999-05-28 Single ISABELA PH VENDOR 180000.0 No Single Premium Agent Partner 1000000.0 1 1 1987-08-02 Married A0068 2013-09-28 Inactive 1086930.0 2 1 32400.0 216000.0
746 02019010240 12019090303 2019-08-29 2019-09-16 Inforce P09 Unit-Linked Regular Monthly 3300.0 1 3201311009 2 1 2019-11-25 P07 4000.0 Quarterly 2.3 Yes Female 2003-06-28 Separated MANILA PH MATERL PLN 200000.0 No Direct Agent Partner 3000000.0 1 1 1985-09-10 Married A0075 2013-11-18 Inactive 844060.0 12 4 39600.0 16000.0
747 02019011032 12019090845 2019-09-16 2019-09-26 Inforce P17 Traditional Regular Quarterly 17500.0 4 3201208034 2 1 2019-12-20 P03 11700.0 Monthly 2.8 Yes Male 1990-04-15 Single CAMARINES SUR PH SALES MNGR 1000000.0 No Direct Broker 3178000.0 1 1 1984-03-12 Married A0113 2012-08-22 Active 1501010.0 4 12 70000.0 140400.0
748 02019011802 12019100503 2019-10-03 2019-10-16 Inforce P09 Unit-Linked Regular Annually 30000.0 3 3201206030 2 1 2019-11-29 P07 25500.0 Quarterly 1.4 Yes Male 1992-07-05 Single METRO MANILA PH SEAMENOF 600000.0 No Single Premium Broker 1500000.0 1 1 1985-01-23 Married A0128 2012-06-15 Active 961350.0 1 4 30000.0 102000.0
749 02019012018 12019100658 2019-10-08 2019-10-16 Inforce P12 Traditional Regular Quarterly 38300.0 4 3201405062 2 1 2019-11-14 P08 25500.0 Quarterly 1.0 Yes Male 1998-05-13 Married MANILA PH DOCTOR 1700000.0 No Single Premium Agent Partner 3006200.0 2 0 1988-01-10 Married A0115 2014-05-26 Active 301110.0 4 4 153200.0 102000.0

750 rows × 43 columns

Berikut merupakan proses penghapusan kolom yang tidak akan dipakai untuk pemodelan.

In [22]:
drop_col = ['POLICY_STATUS', 'PAYMENT_TYPE', 'MARITAL_STATUS_Agent', 
            'SMOKER', 'DOB_Agent', 'AGENT_ID','TEAM_CODE','APPLICATION_DATE','PURCHASE_ORDER',
            'OCCUPATION','PAYMENT_TERM_1','PAYMENT_TERM_2','AGENT_STATUS','FIRST_YEAR_COMMISSION',
            'TERM_MAPPED_1', 'TERM_MAPPED_2','PREMIUM_1','PREMIUM_2','CONTRACT_DATE']
data.drop(columns=drop_col, inplace=True)
data
Out[22]:
POLICY_ID CLIENT_ID ISSUE_DATE_1 PRODUCT_CODE_1 PRODUCT_TYPE RIDER_COUNT POLICY_OWNERSHIP ISSUE_DATE_2 PRODUCT_CODE_2 PURCHASE_PERIOD TARGET GENDER DOB MARITAL_STATUS LOCATION NATIONALITY ANNUAL_INCOME PAYMENT_METHOD CHANNEL TOTAL_COVERAGE BENEFICIARY_TYPE_PRIMARY BENEFICIARY_TYPE_SECONDARY ANNUALIZED_PREMIUM_1 ANNUALIZED_PREMIUM_2
0 02015000024 12015010024 2015-01-03 P12 Traditional 3 2 2016-11-16 P08 22.4 No Male 2004-02-12 Single BATANGAS PH 1000900.0 Direct Other 2700000.0 2 0 1601500.0 150000.0
1 02015000062 12015010062 2015-01-02 P09 Unit-Linked 2 2 2019-02-25 P07 49.8 No Male 1985-01-06 Divorced QUEZON PH 180000.0 Direct Agent Partner 1050000.0 1 1 32400.0 14400.0
2 02015000087 12015010087 2015-01-09 P12 Traditional 2 2 2015-11-06 P08 9.9 No Male 2002-10-19 Married RIZAL PH 1000000.0 Direct Agent Staff 5890900.0 1 1 180000.0 130000.0
3 02015000088 12015010088 2015-01-12 P12 Traditional 2 2 2017-12-07 P08 34.8 No Male 1982-08-24 Divorced MANILA PH 5000000.0 Direct Agent Staff 45342100.0 1 1 7500000.0 350000.0
4 02018002785 12018040234 2018-04-26 P09 Unit-Linked 2 2 2018-09-08 P07 4.4 Yes Female 1996-08-30 Married ORIENTAL MINDORO PH 1000000.0 Direct Agent Partner 5200000.0 1 1 80000.0 110000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
745 02019009354 12019080681 2019-08-28 P17 Traditional 4 2 2019-10-30 P03 2.1 Yes Female 1999-05-28 Single ISABELA PH 180000.0 Single Premium Agent Partner 1000000.0 1 1 32400.0 216000.0
746 02019010240 12019090303 2019-09-16 P09 Unit-Linked 1 2 2019-11-25 P07 2.3 Yes Female 2003-06-28 Separated MANILA PH 200000.0 Direct Agent Partner 3000000.0 1 1 39600.0 16000.0
747 02019011032 12019090845 2019-09-26 P17 Traditional 4 2 2019-12-20 P03 2.8 Yes Male 1990-04-15 Single CAMARINES SUR PH 1000000.0 Direct Broker 3178000.0 1 1 70000.0 140400.0
748 02019011802 12019100503 2019-10-16 P09 Unit-Linked 3 2 2019-11-29 P07 1.4 Yes Male 1992-07-05 Single METRO MANILA PH 600000.0 Single Premium Broker 1500000.0 1 1 30000.0 102000.0
749 02019012018 12019100658 2019-10-16 P12 Traditional 4 2 2019-11-14 P08 1.0 Yes Male 1998-05-13 Married MANILA PH 1700000.0 Single Premium Agent Partner 3006200.0 2 0 153200.0 102000.0

750 rows × 24 columns

In [23]:
def plot_correlation_heatmap(data):
    pearson = data.corr()
    shape = np.triu(pearson)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax = sns.heatmap(pearson, mask=shape, annot=True, fmt='.1%', cmap='RdYlGn_r', cbar_kws={'shrink': 0.8},
                     vmin=-1, center=0, vmax=1, square=True, linewidths=0.1, linecolor='white')
    plt.show()

Dibawah merupakan heatmap untuk korelasi pearson pada data yang akan dipakai untuk modelling. Dapat dilihat bahwa kolom 'BENEFICIARY_TYPE_SECONDARY' berkorelasi kuat dengan kolom 'BENEFICIARY_TYPE_PRIMARY'. Selain itu, kolom 'ANNUALIZED_PREMIUM_1' dan 'ANNUALIZED_PREMIUM_2' memiliki korelasi kuat dengan kolom 'ANNUAL_INCOME'

In [24]:
plot_correlation_heatmap(data)
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\1305969554.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  pearson = data.corr()
No description has been provided for this image

Dibawah merupakan heatmap PPScore untuk data yang akan dipakai dalam pemodelan.

In [25]:
features = [col for col in data.columns if col not in ('POLICY_ID','CLIENT_ID','ISSUE_DATE_1','ISSUE_DATE_2',
                                                       'APPLICATION_DATE','DOB','PRODUCT_CODE_1',
                                                       'PRODUCT_CODE_2','CONTRACT_DATE','TARGET')]
predictive_power(data, features, 'TARGET')
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.
  warnings.warn(
C:\Users\micha\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:700: UserWarning: The least populated class in y has only 2 members, which is less than n_splits=4.
  warnings.warn(
Predictive Power Score Matrix:
No description has been provided for this image
In [26]:
import datetime as dt

data['ISSUE_DATEDATE_1'] = data['ISSUE_DATE_1'].dt.strftime('%m-%d')

Ouliers Treatment¶

In [27]:
data = data[data.ANNUAL_INCOME != 50000000.0]
In [28]:
plt.scatter(data['ANNUALIZED_PREMIUM_1'], data['TOTAL_COVERAGE'])
plt.show()
No description has been provided for this image

Final Data¶

In [29]:
data
Out[29]:
POLICY_ID CLIENT_ID ISSUE_DATE_1 PRODUCT_CODE_1 PRODUCT_TYPE RIDER_COUNT POLICY_OWNERSHIP ISSUE_DATE_2 PRODUCT_CODE_2 PURCHASE_PERIOD TARGET GENDER DOB MARITAL_STATUS LOCATION NATIONALITY ANNUAL_INCOME PAYMENT_METHOD CHANNEL TOTAL_COVERAGE BENEFICIARY_TYPE_PRIMARY BENEFICIARY_TYPE_SECONDARY ANNUALIZED_PREMIUM_1 ANNUALIZED_PREMIUM_2 ISSUE_DATEDATE_1
0 02015000024 12015010024 2015-01-03 P12 Traditional 3 2 2016-11-16 P08 22.4 No Male 2004-02-12 Single BATANGAS PH 1000900.0 Direct Other 2700000.0 2 0 1601500.0 150000.0 01-03
1 02015000062 12015010062 2015-01-02 P09 Unit-Linked 2 2 2019-02-25 P07 49.8 No Male 1985-01-06 Divorced QUEZON PH 180000.0 Direct Agent Partner 1050000.0 1 1 32400.0 14400.0 01-02
2 02015000087 12015010087 2015-01-09 P12 Traditional 2 2 2015-11-06 P08 9.9 No Male 2002-10-19 Married RIZAL PH 1000000.0 Direct Agent Staff 5890900.0 1 1 180000.0 130000.0 01-09
3 02015000088 12015010088 2015-01-12 P12 Traditional 2 2 2017-12-07 P08 34.8 No Male 1982-08-24 Divorced MANILA PH 5000000.0 Direct Agent Staff 45342100.0 1 1 7500000.0 350000.0 01-12
4 02018002785 12018040234 2018-04-26 P09 Unit-Linked 2 2 2018-09-08 P07 4.4 Yes Female 1996-08-30 Married ORIENTAL MINDORO PH 1000000.0 Direct Agent Partner 5200000.0 1 1 80000.0 110000.0 04-26
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
745 02019009354 12019080681 2019-08-28 P17 Traditional 4 2 2019-10-30 P03 2.1 Yes Female 1999-05-28 Single ISABELA PH 180000.0 Single Premium Agent Partner 1000000.0 1 1 32400.0 216000.0 08-28
746 02019010240 12019090303 2019-09-16 P09 Unit-Linked 1 2 2019-11-25 P07 2.3 Yes Female 2003-06-28 Separated MANILA PH 200000.0 Direct Agent Partner 3000000.0 1 1 39600.0 16000.0 09-16
747 02019011032 12019090845 2019-09-26 P17 Traditional 4 2 2019-12-20 P03 2.8 Yes Male 1990-04-15 Single CAMARINES SUR PH 1000000.0 Direct Broker 3178000.0 1 1 70000.0 140400.0 09-26
748 02019011802 12019100503 2019-10-16 P09 Unit-Linked 3 2 2019-11-29 P07 1.4 Yes Male 1992-07-05 Single METRO MANILA PH 600000.0 Single Premium Broker 1500000.0 1 1 30000.0 102000.0 10-16
749 02019012018 12019100658 2019-10-16 P12 Traditional 4 2 2019-11-14 P08 1.0 Yes Male 1998-05-13 Married MANILA PH 1700000.0 Single Premium Agent Partner 3006200.0 2 0 153200.0 102000.0 10-16

747 rows × 25 columns

EDA¶

Berikut merupakan EDA yang dapat dikatakan cukup lengkap yang dihasilkan oleh ydata_profiling. Variabel-variabel yang akan dipakai untuk pemodelan menunjukkan variasi yang cukup bagus dan tidak uniform.

In [30]:
from ydata_profiling import ProfileReport

profile = ProfileReport(data, title = 'Data Profiling Report', explorative = False, minimal = True)
profile
C:\Users\micha\anaconda3\lib\site-packages\numba\core\decorators.py:262: NumbaDeprecationWarning: numba.generated_jit is deprecated. Please see the documentation at: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-generated-jit for more information and advice on a suitable replacement.
  warnings.warn(msg, NumbaDeprecationWarning)
C:\Users\micha\AppData\Roaming\Python\Python39\site-packages\visions\backends\shared\nan_handling.py:51: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def hasna(x: np.ndarray) -> bool:
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[30]:

In [31]:
frequency_mapping = {
    'Annually': 1,
    'Semi-Annually': 2,
    'Quarterly': 4,
    'Monthly': 12
}

data3['TERM_MAPPED'] = data3['PAYMENT_TERM'].map(frequency_mapping).astype(int)
data3['ANNUALIZED_PREMIUM'] = data3['TERM_MAPPED'] * data3['PREMIUM']
data3.head()
Out[31]:
POLICY_ID CLIENT_ID APPLICATION_DATE ISSUE_DATE POLICY_STATUS PRODUCT_CODE PRODUCT_TYPE PAYMENT_TYPE PAYMENT_TERM PREMIUM RIDER_COUNT AGENT_ID POLICY_OWNERSHIP PURCHASE_ORDER TERM_MAPPED ANNUALIZED_PREMIUM
0 02015000001 12015010001 2014-12-02 2015-01-01 Inforce P01 Unit-Linked Single Annually 4061000.0 5 3201207023 1 1 1 4061000.0
1 02015000002 12015010002 2014-12-02 2015-01-01 Inforce P02 Unit-Linked Regular Quarterly 31300.0 4 3201302026 1 1 4 125200.0
2 02015000003 12015010003 2014-12-04 2015-01-03 Inforce P03 Unit-Linked Single Annually 1400000.0 2 3201310043 3 1 1 1400000.0
3 02015005464 12015010003 2015-10-25 2015-11-01 Inforce P03 Unit-Linked Regular Quarterly 37500.0 3 3201306038 3 2 4 150000.0
4 02016000731 12015010003 2016-01-22 2016-02-03 Inforce P03 Unit-Linked Regular Semi-Annually 55000.0 1 3201209034 3 3 2 110000.0
In [32]:
data3['ANNUALIZED_PREMIUM'].max()
Out[32]:
4939212500.0
In [33]:
data3['Year'] = data3.ISSUE_DATE.dt.year
In [34]:
cols = {
    'Year': 'category'
}

data3= data3.astype(cols)
In [35]:
data4 = data3[data3.ANNUALIZED_PREMIUM <= 412500.0]

Berikut merupakan persebaran dari premi tahunan per produk.

In [36]:
sns.boxplot(x=data4['PRODUCT_CODE'], y=data4['ANNUALIZED_PREMIUM'])
Out[36]:
<Axes: xlabel='PRODUCT_CODE', ylabel='ANNUALIZED_PREMIUM'>
No description has been provided for this image
In [37]:
sns.distplot(data3['ANNUALIZED_PREMIUM'], hist=True, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\1832918730.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data3['ANNUALIZED_PREMIUM'], hist=True, kde=True,
Out[37]:
<Axes: xlabel='ANNUALIZED_PREMIUM', ylabel='Density'>
No description has been provided for this image
In [38]:
mean_ = data3.groupby('Year').mean()

mean_['ANNUALIZED_PREMIUM'].plot(kind='bar', title='Mean', ylabel='Mean Premium',
         xlabel='Year', figsize=(6, 5))
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\577066128.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  mean_ = data3.groupby('Year').mean()
Out[38]:
<Axes: title={'center': 'Mean'}, xlabel='Year', ylabel='Mean Premium'>
No description has been provided for this image
In [39]:
alt.data_transformers.disable_max_rows()

pcode_base = alt.Chart(data, title='Total Policies and Average Annualized Premium By Product 2 (P03,P07,P08)').encode(
    alt.X('PRODUCT_CODE_2:N', title='Products'),
    tooltip=[alt.Tooltip('count(PRODUCT_ID):Q', title='Total Products Sold'),
             alt.Tooltip('average(PURCHASE_PERIOD):Q', title='Average Policy Processing Period (days)'),
             alt.Tooltip('average(ANNUALIZED_PREMIUM_2):Q', title='Average Annual Premium'),],
)

pcode_bar = pcode_base.mark_bar(opacity=0.85).encode(
    alt.Y('count(POLICY_ID):Q', axis=alt.Axis(title='Total Product Sold', titleColor='#f2903a')),
    color=alt.Color('TARGET:N', scale=alt.Scale(
        scheme='goldred'), legend=alt.Legend(title='Target')),
)

pcode_line = pcode_base.mark_line(stroke='#800020', interpolate='monotone').encode(
    alt.Y('average(ANNUALIZED_PREMIUM_2)', axis=alt.Axis(title='Average Annualized Premium', titleColor='#800020'))
)

pcode_point = pcode_line.mark_circle(color='#800020')

alt.layer(pcode_bar, pcode_line + pcode_point).resolve_scale(
    y='independent'
).properties(
    width=600,
    height=200
)
Out[39]:

==============================================================================================================================

Bar: menunjukkan total produk asuransi yang terjual untuk setiap produk berdasarkan kode produk kedua.

Dot: pada grafik garis menampilkan nilai rata-rata premi tahunan pada masing-masing product type.

==============================================================================================================================

In [40]:
cnat_base = alt.Chart(data, title='Average Annual Income and Total Coverage By Client\'s Nationality').encode(
    alt.X('NATIONALITY:N', title='Nationality'),
)

cnat_bar = cnat_base.mark_bar(color='#800020').encode(
    alt.Y('average(ANNUAL_INCOME):Q', axis=alt.Axis(title='Annual Income',
          titleColor='#800020')),
)

cnat_line = cnat_base.mark_line(stroke='goldenrod', interpolate='monotone').encode(
    alt.Y('count(CLIENT_ID):Q', axis = alt.Axis(title='Total Client',
                                                     titleColor='goldenrod')
))

cnat_point = cnat_line.mark_circle(color='goldenrod')

alt.layer(cnat_bar, cnat_line + cnat_point).resolve_scale(
    y='independent'
).properties(
    width=600,
    height=300
)
Out[40]:

==============================================================================================================================

Bar: menunjukkan rata-rata pendapatan tahunan untuk setiap kewarganegaraan..

Line: menampilkan jumlah total klien untuk setiap kewarganegaraan.

Dote: menyoroti jumlah total klien pada titik-titik data tertentu.

==============================================================================================================================

Transformasi Data¶

Dapat dilihat pada 4 grafik distribusi dibawah, persebaran value pada kolom ANNUAL_INCOME, TOTAL_COVERAGE, ANNUALIZED_PREMIUM_1, dan ANNUALIZED_PREMIUM_2 berbentuk right skew. Oleh karena itu, data-data tersebut ditransformasi secara logaritmik.

In [41]:
fig, axes = plt.subplots(1, 4, figsize=(20, 6))

sns.distplot(data['ANNUAL_INCOME'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[0])
axes[0].set_title('Annual Income')

sns.distplot(data['TOTAL_COVERAGE'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[1])
axes[1].set_title('Total Coverage')

sns.distplot(data['ANNUALIZED_PREMIUM_1'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[2])
axes[2].set_title('Annualized Premium 1')

sns.distplot(data['ANNUALIZED_PREMIUM_2'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[3])
axes[3].set_title('Annualized Premium 2')

plt.tight_layout()  # Adjust spacing between subplots
plt.show()
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3971122945.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['ANNUAL_INCOME'], hist=True, kde=True,
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3971122945.py:9: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['TOTAL_COVERAGE'], hist=True, kde=True,
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3971122945.py:15: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['ANNUALIZED_PREMIUM_1'], hist=True, kde=True,
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3971122945.py:21: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['ANNUALIZED_PREMIUM_2'], hist=True, kde=True,
No description has been provided for this image
In [42]:
data['ANNUAL_INCOME'] = np.log(data['ANNUAL_INCOME'])
data['TOTAL_COVERAGE'] = np.log(data['TOTAL_COVERAGE'])
data['ANNUALIZED_PREMIUM_1'] = np.log(data['ANNUALIZED_PREMIUM_1'])
data['ANNUALIZED_PREMIUM_2'] = np.log(data['ANNUALIZED_PREMIUM_2'])
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\2103050638.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ANNUAL_INCOME'] = np.log(data['ANNUAL_INCOME'])
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\2103050638.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['TOTAL_COVERAGE'] = np.log(data['TOTAL_COVERAGE'])
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\2103050638.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ANNUALIZED_PREMIUM_1'] = np.log(data['ANNUALIZED_PREMIUM_1'])
C:\Users\micha\anaconda3\lib\site-packages\pandas\core\arraylike.py:402: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\2103050638.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ANNUALIZED_PREMIUM_2'] = np.log(data['ANNUALIZED_PREMIUM_2'])

Dapat dilihat, data setelah transformasi mengikuti curve bell yang lebih mirip dengan distribusi normal.

In [43]:
data['ANNUALIZED_PREMIUM_2'] = data['ANNUALIZED_PREMIUM_2'].replace(-np.inf, 0)

fig, axes = plt.subplots(1, 4, figsize=(20, 6))  # 1 row, 4 columns

sns.distplot(data['ANNUAL_INCOME'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[0])
axes[0].set_title('Annual Income')

sns.distplot(data['TOTAL_COVERAGE'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[1])
axes[1].set_title('Total Coverage')

sns.distplot(data['ANNUALIZED_PREMIUM_1'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[2])
axes[2].set_title('Annualized Premium 1')

sns.distplot(data['ANNUALIZED_PREMIUM_2'], hist=True, kde=True, 
             bins=int(180/5), color='darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, ax=axes[3])
axes[3].set_title('Annualized Premium 2')

plt.tight_layout()
plt.show()
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3259071580.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ANNUALIZED_PREMIUM_2'] = data['ANNUALIZED_PREMIUM_2'].replace(-np.inf, 0)
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3259071580.py:5: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['ANNUAL_INCOME'], hist=True, kde=True,
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3259071580.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['TOTAL_COVERAGE'], hist=True, kde=True,
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3259071580.py:17: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['ANNUALIZED_PREMIUM_1'], hist=True, kde=True,
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\3259071580.py:23: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['ANNUALIZED_PREMIUM_2'], hist=True, kde=True,
No description has been provided for this image
In [44]:
data['ANNUALIZED_PREMIUM_2'] = data['ANNUALIZED_PREMIUM_2'].replace(0, -np.inf)
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\2178044227.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ANNUALIZED_PREMIUM_2'] = data['ANNUALIZED_PREMIUM_2'].replace(0, -np.inf)
In [45]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 747 entries, 0 to 749
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   POLICY_ID                   747 non-null    object        
 1   CLIENT_ID                   747 non-null    object        
 2   ISSUE_DATE_1                747 non-null    datetime64[ns]
 3   PRODUCT_CODE_1              747 non-null    category      
 4   PRODUCT_TYPE                747 non-null    category      
 5   RIDER_COUNT                 747 non-null    int32         
 6   POLICY_OWNERSHIP            747 non-null    int32         
 7   ISSUE_DATE_2                747 non-null    datetime64[ns]
 8   PRODUCT_CODE_2              747 non-null    object        
 9   PURCHASE_PERIOD             747 non-null    float64       
 10  TARGET                      747 non-null    category      
 11  GENDER                      747 non-null    category      
 12  DOB                         747 non-null    datetime64[ns]
 13  MARITAL_STATUS              747 non-null    category      
 14  LOCATION                    747 non-null    object        
 15  NATIONALITY                 747 non-null    category      
 16  ANNUAL_INCOME               747 non-null    float64       
 17  PAYMENT_METHOD              747 non-null    category      
 18  CHANNEL                     747 non-null    category      
 19  TOTAL_COVERAGE              745 non-null    float64       
 20  BENEFICIARY_TYPE_PRIMARY    747 non-null    int32         
 21  BENEFICIARY_TYPE_SECONDARY  747 non-null    int32         
 22  ANNUALIZED_PREMIUM_1        747 non-null    float64       
 23  ANNUALIZED_PREMIUM_2        747 non-null    float64       
 24  ISSUE_DATEDATE_1            747 non-null    object        
dtypes: category(8), datetime64[ns](3), float64(5), int32(4), object(5)
memory usage: 101.3+ KB

SPLITTING DATA¶

Selanjutnya merupakan proses splitting data untuk training dan testing. Data training yang dipakai merupakan data yang dimana waktu pengeluaran produk ke dua merentang dari tahun 2015 hingga akhir 2018. Data yang digunakan untuk testing adalah sisanya (data dimana produk ke dua memiliki waktu pengeluaran pada tahun 2019 keatas).

In [46]:
test = data.loc[(data['ISSUE_DATE_2'] >= '2019-01-01')]
train = data.loc[(data['ISSUE_DATE_2'] < '2019-01-01')]

test.reset_index(inplace=True)
train.reset_index(inplace=True)
In [47]:
import h2o
h2o.init()
Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O_cluster_uptime: 20 mins 05 secs
H2O_cluster_timezone: Asia/Bangkok
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.42.0.2
H2O_cluster_version_age: 1 month and 7 days
H2O_cluster_name: H2O_from_python_micha_jgmwyc
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 190.5 Mb
H2O_cluster_total_cores: 8
H2O_cluster_allowed_cores: 8
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
Python_version: 3.9.16 final
In [48]:
from h2o.automl import H2OAutoML

Create Fold¶

Dibawah merupakan proses pembuatan fold untuk proses cross validation pada data training. Kelompok kami menggunakan 5 fold untuk proses cross validation. Pembuatan fold dilakukan 2 kali untuk proses training model Propensity to Buy dan model Product Recommendation.

In [49]:
seed = 11
np.random.seed(seed)

cv_fold1 = train[['TARGET']].copy()
cv_fold1.reset_index(inplace=True)
cv_fold1['Random'] = np.random.random(len(train))

cv_fold1.sort_values(by=["TARGET", 'Random'], inplace=True, ignore_index=True)

kfold = 5
cv_fold1['CV_Fold_1'] = cv_fold1.index % kfold + 1

cv_fold1.sort_values(by='index', inplace=True, ignore_index=True)

train1 = train.assign(CV_Fold_1 = cv_fold1['CV_Fold_1'])
train1.groupby(['CV_Fold_1', 'TARGET']).agg(Count = ('CLIENT_ID','count'))

del cv_fold1

#train1.groupby(['CV_Fold_1', 'TARGET']).agg(Count = ('CLIENT_ID', 'count'))
In [50]:
seed = 11
np.random.seed(seed)

cv_fold = train[['PRODUCT_CODE_2']].copy()
cv_fold.reset_index(inplace=True)
cv_fold['Random'] = np.random.random(len(train))

cv_fold.sort_values(by=["PRODUCT_CODE_2", 'Random'], inplace=True, ignore_index=True)

kfold = 5
cv_fold['CV_Fold'] = cv_fold.index % kfold + 1

cv_fold.sort_values(by='index', inplace=True, ignore_index=True)

train = train.assign(CV_Fold = cv_fold['CV_Fold'])
train.groupby(['CV_Fold', 'PRODUCT_CODE_2']).agg(Count = ('CLIENT_ID','count'))

del cv_fold

MODEL 1 (TARGET ~ Propensity to Buy)¶

Dibawah merupakan proses pembuatan model pertama. Kelompok kami menggunakan model yang terbaik yang dihasilkan oleh fungsi AutoML dari package H2O.

In [51]:
start_time = pd.to_datetime('now', utc=True) + pd.Timedelta('07:00:00')

seed = 11
train_frame = h2o.H2OFrame(train1)

train_frame['TARGET'] = train_frame['TARGET']

timer(start_time, header='\nConverting DataFrame to H2O Frame')
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


Converting DataFrame to H2O Frame:
Start   : 2023-09-01 10:11:44 PM
Finish  : 2023-09-01 10:11:55 PM
Runtime : 00:00:11 

In [52]:
variables = ['PRODUCT_CODE_1',
             'ANNUALIZED_PREMIUM_1',
             'RIDER_COUNT',
             'GENDER',
             'MARITAL_STATUS',
             'NATIONALITY',
             'LOCATION',
             'ISSUE_DATE_1',
             'ANNUAL_INCOME',
             'PAYMENT_METHOD',
             'TOTAL_COVERAGE',
            ]
target = 'TARGET'

runtime = 60
algos = ['GLM', 'DRF', 'GBM', 'DeepLearning']
AML = H2OAutoML(max_runtime_secs=runtime, include_algos=algos, seed=seed)
AML.train(x=variables, y=target, training_frame=train_frame, fold_column = 'CV_Fold_1')

AML.leaderboard
AutoML progress: |█
22:12:02.484: Fold column CV_Fold_1 will be used for cross-validation. nfolds parameter will be ignored.

██████████████████████████████████████████████████████████████| 100%e) 100%
Out[52]:
model_id auc logloss aucpr mean_per_class_error rmse mse
GBM_grid_1_AutoML_4_20230901_221202_model_130.730498 0.4857410.537122 0.3159690.3927670.154266
GBM_grid_1_AutoML_4_20230901_221202_model_110.727126 0.4865940.481736 0.3130690.3959350.156765
GBM_grid_1_AutoML_4_20230901_221202_model_9 0.71793 0.4912020.520054 0.3232570.39666 0.157339
GBM_3_AutoML_4_20230901_221202 0.705436 0.4914870.523976 0.3178780.39515 0.156143
GBM_4_AutoML_4_20230901_221202 0.7038 0.5066980.481991 0.3398650.4044050.163544
GLM_1_AutoML_4_20230901_221202 0.700441 0.4958780.522456 0.3328380.3990790.159264
GBM_grid_1_AutoML_4_20230901_221202_model_1 0.698148 0.5054140.446058 0.3409560.4058240.164693
GBM_2_AutoML_4_20230901_221202 0.697938 0.4994960.510877 0.3394930.3982920.158636
GBM_grid_1_AutoML_4_20230901_221202_model_120.695781 0.4948470.491324 0.3389480.3993450.159477
GBM_grid_1_AutoML_4_20230901_221202_model_4 0.68765 0.5090720.399718 0.3325520.4077710.166277
[25 rows x 7 columns]
In [53]:
model0 = AML.leader
model0
Out[53]:
Model Details
=============
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_grid_1_AutoML_4_20230901_221202_model_13
Model Summary:
number_of_trees number_of_internal_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves
26.0 26.0 10900.0 8.0 14.0 10.769231 22.0 32.0 28.461538
ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.04808370779727791
RMSE: 0.21927997582378084
LogLoss: 0.19875260106147147
Mean Per-Class Error: 0.017252918866661707
AUC: 0.9973971889640812
AUCPR: 0.9910658750762342
Gini: 0.9947943779281625
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.35267457830370236
No Yes Error Rate
No 351.0 6.0 0.0168 (6.0/357.0)
Yes 2.0 111.0 0.0177 (2.0/113.0)
Total 353.0 117.0 0.017 (8.0/470.0)
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
max f1 0.3526746 0.9652174 114.0
max f2 0.2738367 0.9792028 122.0
max f0point5 0.4160493 0.9646018 110.0
max accuracy 0.4160493 0.9829787 110.0
max precision 0.9437941 1.0 0.0
max recall 0.2738367 1.0 122.0
max specificity 0.9437941 1.0 0.0
max absolute_mcc 0.3526746 0.9542072 114.0
max min_per_class_accuracy 0.3526746 0.9823009 114.0
max mean_per_class_accuracy 0.3037659 0.9843707 117.0
max tns 0.9437941 357.0 0.0
max fns 0.9437941 112.0 0.0
max fps 0.0181360 357.0 399.0
max tps 0.2738367 113.0 122.0
max tnr 0.9437941 1.0 0.0
max fnr 0.9437941 0.9911504 0.0
max fpr 0.0181360 1.0 399.0
max tpr 0.2738367 1.0 122.0
Gains/Lift Table: Avg response rate: 24.04 %, avg score: 24.04 %
group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov
1 0.0106383 0.9213609 4.1592920 4.1592920 1.0 0.9338186 1.0 0.9338186 0.0442478 0.0442478 315.9292035 315.9292035 0.0442478
2 0.0212766 0.8961901 4.1592920 4.1592920 1.0 0.9056798 1.0 0.9197492 0.0442478 0.0884956 315.9292035 315.9292035 0.0884956
3 0.0319149 0.8626197 4.1592920 4.1592920 1.0 0.8758808 1.0 0.9051264 0.0442478 0.1327434 315.9292035 315.9292035 0.1327434
4 0.0404255 0.8410511 4.1592920 4.1592920 1.0 0.8526109 1.0 0.8940705 0.0353982 0.1681416 315.9292035 315.9292035 0.1681416
5 0.0510638 0.8196106 4.1592920 4.1592920 1.0 0.8285709 1.0 0.8804247 0.0442478 0.2123894 315.9292035 315.9292035 0.2123894
6 0.1 0.7158527 4.1592920 4.1592920 1.0 0.7729012 1.0 0.8278069 0.2035398 0.4159292 315.9292035 315.9292035 0.4159292
7 0.1510638 0.5836152 4.1592920 4.1592920 1.0 0.6550671 1.0 0.7694160 0.2123894 0.6283186 315.9292035 315.9292035 0.6283186
8 0.2 0.4807496 3.7976145 4.0707965 0.9130435 0.5249401 0.9787234 0.7095974 0.1858407 0.8141593 279.7614467 307.0796460 0.8085571
9 0.3 0.2207605 1.8584071 3.3333333 0.4468085 0.3433888 0.8014184 0.5875279 0.1858407 1.0 85.8407080 233.3333333 0.9215686
10 0.4 0.1555439 0.0 2.5 0.0 0.1821685 0.6010638 0.4861880 0.0 1.0 -100.0 150.0 0.7899160
11 0.5 0.1141355 0.0 2.0 0.0 0.1347484 0.4808511 0.4159001 0.0 1.0 -100.0 100.0 0.6582633
12 0.6 0.0904905 0.0 1.6666667 0.0 0.1017356 0.4007092 0.3635393 0.0 1.0 -100.0 66.6666667 0.5266106
13 0.7 0.0707116 0.0 1.4285714 0.0 0.0784595 0.3434650 0.3228136 0.0 1.0 -100.0 42.8571429 0.3949580
14 0.8 0.0544262 0.0 1.25 0.0 0.0626467 0.3005319 0.2902928 0.0 1.0 -100.0 25.0 0.2633053
15 0.9 0.0406389 0.0 1.1111111 0.0 0.0476981 0.2671395 0.2633378 0.0 1.0 -100.0 11.1111111 0.1316527
16 1.0 0.0181360 0.0 1.0 0.0 0.0339901 0.2404255 0.2404030 0.0 1.0 -100.0 0.0 0.0
ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.15426593388324308
RMSE: 0.39276702239781164
LogLoss: 0.48574078751400046
Mean Per-Class Error: 0.3159688654222751
AUC: 0.7304975087380084
AUCPR: 0.5371218347182841
Gini: 0.4609950174760169
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3120699995981984
No Yes Error Rate
No 302.0 55.0 0.1541 (55.0/357.0)
Yes 54.0 59.0 0.4779 (54.0/113.0)
Total 356.0 114.0 0.2319 (109.0/470.0)
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
max f1 0.3120700 0.5198238 109.0
max f2 0.1268199 0.6330014 234.0
max f0point5 0.5279173 0.5481728 45.0
max accuracy 0.5778527 0.8 37.0
max precision 0.9145606 1.0 0.0
max recall 0.0341627 1.0 381.0
max specificity 0.9145606 1.0 0.0
max absolute_mcc 0.3120700 0.3669587 109.0
max min_per_class_accuracy 0.1816340 0.6637168 184.0
max mean_per_class_accuracy 0.3120700 0.6840311 109.0
max tns 0.9145606 357.0 0.0
max fns 0.9145606 112.0 0.0
max fps 0.0123395 357.0 399.0
max tps 0.0341627 113.0 381.0
max tnr 0.9145606 1.0 0.0
max fnr 0.9145606 0.9911504 0.0
max fpr 0.0123395 1.0 399.0
max tpr 0.0341627 1.0 381.0
Gains/Lift Table: Avg response rate: 24.04 %, avg score: 22.16 %
group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov
1 0.0106383 0.8776595 4.1592920 4.1592920 1.0 0.9006351 1.0 0.9006351 0.0442478 0.0442478 315.9292035 315.9292035 0.0442478
2 0.0212766 0.8388734 4.1592920 4.1592920 1.0 0.8555156 1.0 0.8780753 0.0442478 0.0884956 315.9292035 315.9292035 0.0884956
3 0.0319149 0.7995901 2.4955752 3.6047198 0.6 0.8102755 0.8666667 0.8554754 0.0265487 0.1150442 149.5575221 260.4719764 0.1094420
4 0.0404255 0.7771160 1.0398230 3.0647415 0.25 0.7854265 0.7368421 0.8407282 0.0088496 0.1238938 3.9823009 206.4741500 0.1098882
5 0.0510638 0.7046516 4.1592920 3.2927729 1.0 0.7461158 0.7916667 0.8210173 0.0442478 0.1681416 315.9292035 229.2772861 0.1541360
6 0.1 0.5273687 2.5317430 2.9203540 0.6086957 0.5970945 0.7021277 0.7114381 0.1238938 0.2920354 153.1742978 192.0353982 0.2528197
7 0.1510638 0.4309686 1.0398230 2.2846815 0.25 0.4809597 0.5492958 0.6335299 0.0530973 0.3451327 3.9823009 128.4681541 0.2554969
8 0.2 0.3572414 1.9892266 2.2123894 0.4782609 0.3925309 0.5319149 0.5745620 0.0973451 0.4424779 98.9226626 121.2389381 0.3192286
9 0.3 0.2517622 1.2389381 1.8879056 0.2978723 0.3034441 0.4539007 0.4841894 0.1238938 0.5663717 23.8938053 88.7905605 0.3506854
10 0.4 0.1900556 0.7964602 1.6150442 0.1914894 0.2221981 0.3882979 0.4186916 0.0796460 0.6460177 -20.3539823 61.5044248 0.3238888
11 0.5 0.1364229 0.8849558 1.4690265 0.2127660 0.1618478 0.3531915 0.3673228 0.0884956 0.7345133 -11.5044248 46.9026549 0.3087430
12 0.6 0.1061296 0.7964602 1.3569322 0.1914894 0.1215737 0.3262411 0.3263646 0.0796460 0.8141593 -20.3539823 35.6932153 0.2819464
13 0.7 0.0831462 0.4424779 1.2262958 0.1063830 0.0934024 0.2948328 0.2930843 0.0442478 0.8584071 -55.7522124 22.6295828 0.2085471
14 0.8 0.0662505 0.5309735 1.1393805 0.1276596 0.0744827 0.2739362 0.2657591 0.0530973 0.9115044 -46.9026549 13.9380531 0.1467985
15 0.9 0.0460618 0.5309735 1.0717797 0.1276596 0.0567810 0.2576832 0.2425393 0.0530973 0.9646018 -46.9026549 7.1779744 0.0850499
16 1.0 0.0123395 0.3539823 1.0 0.0851064 0.0333078 0.2404255 0.2216162 0.0353982 1.0 -64.6017699 0.0 0.0
Cross-Validation Metrics Summary:
mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
accuracy 0.7106383 0.0875319 0.5851064 0.7765958 0.8085107 0.7021276 0.6808510
auc 0.7244744 0.0617560 0.6395202 0.6938131 0.7911819 0.7219841 0.7758727
err 0.2893617 0.0875319 0.4148936 0.2234043 0.1914894 0.2978723 0.3191489
err_count 27.2 8.228001 39.0 21.0 18.0 28.0 30.0
f0point5 0.4890643 0.0880588 0.3669725 0.5188679 0.610687 0.4748603 0.4739336
f1 0.5444295 0.0702472 0.4507042 0.5116279 0.64 0.5483871 0.5714286
f2 0.6258154 0.0835181 0.5839416 0.5045872 0.6722689 0.648855 0.7194245
lift_top_group 4.161265 0.1017508 4.2727275 4.2727275 4.0869565 4.0869565 4.0869565
logloss 0.4997030 0.0537379 0.5709812 0.5155625 0.4364802 0.5188571 0.4566342
max_per_class_error 0.3905644 0.0874945 0.4583333 0.5 0.3043478 0.3098592 0.3802817
mcc 0.3807987 0.1035108 0.2279438 0.3670687 0.5137458 0.3745513 0.4206838
mean_per_class_accuracy 0.7089328 0.0534582 0.6344697 0.6805556 0.7703613 0.7146356 0.7446418
mean_per_class_error 0.2910672 0.0534582 0.3655303 0.3194444 0.2296387 0.2853644 0.2553583
mse 0.1562454 0.0156168 0.1737971 0.1610780 0.1358629 0.1658903 0.1445988
pr_auc 0.5368065 0.0820717 0.4580024 0.4925370 0.6313899 0.4824174 0.6196859
precision 0.4608724 0.1015280 0.3265306 0.5238095 0.5925926 0.4358974 0.4255319
r2 0.1433611 0.0954488 0.0305105 0.1014612 0.2648596 0.1023838 0.2175905
recall 0.7063241 0.1330848 0.7272728 0.5 0.6956522 0.7391304 0.8695652
rmse 0.3948778 0.0199058 0.4168898 0.4013453 0.3685958 0.4072964 0.3802615
specificity 0.7115415 0.1395958 0.5416667 0.8611111 0.8450704 0.6901408 0.6197183
Scoring History:
timestamp duration number_of_trees training_rmse training_logloss training_auc training_pr_auc training_lift training_classification_error
2023-09-01 22:12:14 6.768 sec 0.0 0.4273419 0.5515699 0.5 0.2404255 1.0 0.7595745
2023-09-01 22:12:14 6.794 sec 5.0 0.3643319 0.4250903 0.9326492 0.8398545 4.1592920 0.1255319
2023-09-01 22:12:14 6.819 sec 10.0 0.3185555 0.3447529 0.9703899 0.9108737 4.1592920 0.0723404
2023-09-01 22:12:14 6.845 sec 15.0 0.2827596 0.2880355 0.9857961 0.9579168 4.1592920 0.0404255
2023-09-01 22:12:14 6.878 sec 20.0 0.2533978 0.2430484 0.9927121 0.9751680 4.1592920 0.0297872
2023-09-01 22:12:14 6.904 sec 25.0 0.2244603 0.2052331 0.9968394 0.9893688 4.1592920 0.0170213
2023-09-01 22:12:14 6.916 sec 26.0 0.2192800 0.1987526 0.9973972 0.9910659 4.1592920 0.0170213
Variable Importances:
variable relative_importance scaled_importance percentage
ISSUE_DATE_1 92.4556274 1.0 0.3079927
LOCATION 80.3664093 0.8692430 0.2677205
ANNUALIZED_PREMIUM_1 30.4027653 0.3288363 0.1012792
ANNUAL_INCOME 25.3543491 0.2742326 0.0844617
TOTAL_COVERAGE 20.6812611 0.2236885 0.0688944
MARITAL_STATUS 14.5560350 0.1574381 0.0484898
PAYMENT_METHOD 12.2580433 0.1325830 0.0408346
GENDER 6.9964027 0.0756731 0.0233068
PRODUCT_CODE_1 6.9548378 0.0752235 0.0231683
RIDER_COUNT 6.4212408 0.0694521 0.0213908
NATIONALITY 3.7407303 0.0404597 0.0124613

[tips]
Use `model.explain()` to inspect the model.
--
Use `h2o.display.toggle_user_tips()` to switch on/off this section.
In [54]:
threshold = model0.F1(xval=True)[0][0]
In [55]:
train['TARGET'].value_counts()
Out[55]:
No     357
Yes    113
Name: TARGET, dtype: int64

Model 1 Prediction¶

Dibawah merupakan proses dan hasil dari prediksi dari model diatas. Model tersebut menghasilkan nilai f1 0.41 pada data testing, nilai akurasi 0.65, serta conversion rate sebesar 0.48.

In [56]:
test_frame = h2o.H2OFrame(test)

pred=model0.predict(test_frame)
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
In [57]:
tgt = test_frame['TARGET'].as_data_frame()
tgt['TARGET'] = np.where(tgt['TARGET']=='No',0,1)
In [58]:
prob = pred['Yes'].as_data_frame()
prob['Predicted'] = np.where(prob['Yes']>threshold, 1,0)
In [59]:
tp = np.sum((prob['Predicted'] == 1) & (tgt['TARGET'] == 1))
fp = np.sum((prob['Predicted'] == 1) & (tgt['TARGET']== 0))
fn = np.sum((prob['Predicted'] == 0) & (tgt['TARGET'] == 1))
tn = np.sum((prob['Predicted'] == 0) & (tgt['TARGET'] == 0))
    
precision = tp / (tp + fp)
sensitivity = tp / (tp + fn)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
acc = (tp + tn) / (tp + fp + tn + fn)
In [60]:
f1
Out[60]:
0.40963855421686746
In [61]:
acc
Out[61]:
0.6462093862815884
In [62]:
conversion_rate = np.sum(prob['Predicted'] == 1) / len(prob)
print("Conversion Rate:", conversion_rate)
Conversion Rate: 0.47653429602888087

MODEL 2 (PRODUCT RECOMENDATION)¶

Dibawah merupakan proses pembuatan model kedua. Metode yang dipakai sama dengan proses pembuatan model pertama. Model 2 dibuat untuk memberikan rekomendasi produk untuk pemegang polis yang sudah memiliki 1 polis. Produk asuransi yang akan ditawarkan kepada pemegang polis akan bergantung kepada beberapa variabel, salah satunya adalah polis pertama yang sudah dipegang.

In [63]:
start_time = pd.to_datetime('now', utc=True) + pd.Timedelta('07:00:00')

seed = 11
train_frame = h2o.H2OFrame(train)

train_frame['PRODUCT_CODE_2'] = train_frame['PRODUCT_CODE_2']

timer(start_time, header='\nConverting DataFrame to H2O Frame')
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


Converting DataFrame to H2O Frame:
Start   : 2023-09-01 10:19:58 PM
Finish  : 2023-09-01 10:20:08 PM
Runtime : 00:00:10 

In [64]:
variables = ['PRODUCT_CODE_1','ANNUALIZED_PREMIUM_1','RIDER_COUNT','GENDER',
             'MARITAL_STATUS','LOCATION','NATIONALITY',
             'ANNUAL_INCOME',
             'PAYMENT_METHOD','CHANNEL','TOTAL_COVERAGE']
target = 'PRODUCT_CODE_2'

runtime = 60
algos = ['GLM', 'DRF', 'GBM', 'XGBoost','DeepLearning']
AML2 = H2OAutoML(max_runtime_secs=runtime, include_algos=algos, seed=seed)
AML2.train(x=variables, y=target, training_frame=train_frame, fold_column = 'CV_Fold')

AML2.leaderboard
AutoML progress: |█
22:20:14.788: Fold column CV_Fold will be used for cross-validation. nfolds parameter will be ignored.
22:20:14.792: AutoML: XGBoost is not available; skipping it.
22:20:15.661: GLM_1_AutoML_5_20230901_222014 [GLM def_1] failed: java.lang.ArrayIndexOutOfBoundsException: 61

██████████████████████████████████████████████████████████████| (done) 100%
Out[64]:
model_id mean_per_class_error logloss rmse mse
GBM_grid_1_AutoML_5_20230901_222014_model_4 0.001366120.01449050.04834810.00233754
DeepLearning_grid_1_AutoML_5_20230901_222014_model_3 0.001366120.01568530.05719780.00327159
GBM_grid_1_AutoML_5_20230901_222014_model_3 0.001366120.01588660.049011 0.00240208
DeepLearning_grid_1_AutoML_5_20230901_222014_model_1 0.001366120.02931310.04612680.00212768
GBM_1_AutoML_5_20230901_222014 0.001366120.01269250.04670210.00218108
GBM_2_AutoML_5_20230901_222014 0.001366120.01356560.04705 0.0022137
GBM_grid_1_AutoML_5_20230901_222014_model_1 0.001366120.01311270.04645780.00215833
GBM_grid_1_AutoML_5_20230901_222014_model_2 0.001366120.01468490.04663260.0021746
GBM_4_AutoML_5_20230901_222014 0.001366120.01417360.04974650.00247471
DRF_1_AutoML_5_20230901_222014 0.001366120.08412410.107179 0.0114874
[19 rows x 5 columns]
In [65]:
model1 = AML2.leader
model1
Out[65]:
Model Details
=============
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_grid_1_AutoML_5_20230901_222014_model_4
Model Summary:
number_of_trees number_of_internal_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves
241.0 723.0 77082.0 2.0 3.0 2.5131397 3.0 4.0 3.735823
ModelMetricsMultinomial: gbm
** Reported on train data. **

MSE: 5.401455495894675e-08
RMSE: 0.00023241031594777964
LogLoss: 2.990926065172747e-05
Mean Per-Class Error: 0.0
AUC table was not computed: it is either disabled (model parameter 'auc_type' was set to AUTO or NONE) or the domain size exceeds the limit (maximum is 50 domains).
AUCPR table was not computed: it is either disabled (model parameter 'auc_type' was set to AUTO or NONE) or the domain size exceeds the limit (maximum is 50 domains).
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
P03 P07 P08 Error Rate
87.0 0.0 0.0 0.0 0 / 87
0.0 244.0 0.0 0.0 0 / 244
0.0 0.0 139.0 0.0 0 / 139
87.0 244.0 139.0 0.0 0 / 470
Top-3 Hit Ratios:
k hit_ratio
1 1.0
2 1.0
3 1.0
ModelMetricsMultinomial: gbm
** Reported on cross-validation data. **

MSE: 0.002337541808052496
RMSE: 0.04834813138118676
LogLoss: 0.01449046844376076
Mean Per-Class Error: 0.001366120218579235
AUC table was not computed: it is either disabled (model parameter 'auc_type' was set to AUTO or NONE) or the domain size exceeds the limit (maximum is 50 domains).
AUCPR table was not computed: it is either disabled (model parameter 'auc_type' was set to AUTO or NONE) or the domain size exceeds the limit (maximum is 50 domains).
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
P03 P07 P08 Error Rate
87.0 0.0 0.0 0.0 0 / 87
1.0 243.0 0.0 0.0040984 1 / 244
0.0 0.0 139.0 0.0 0 / 139
88.0 243.0 139.0 0.0021277 1 / 470
Top-3 Hit Ratios:
k hit_ratio
1 0.9978723
2 0.9978723
3 1.0
Cross-Validation Metrics Summary:
mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
accuracy 0.9978723 0.0047576 1.0 1.0 1.0 0.9893617 1.0
auc nan 0.0 nan nan nan nan nan
err 0.0021277 0.0047576 0.0 0.0 0.0 0.0106383 0.0
err_count 0.2 0.4472136 0.0 0.0 0.0 1.0 0.0
logloss 0.0149537 0.0288870 0.0000297 0.0083110 0.0001683 0.0662330 0.0000267
max_per_class_error 0.0040816 0.0091268 0.0 0.0 0.0 0.0204082 0.0
mean_per_class_accuracy 0.9986395 0.0030423 1.0 1.0 1.0 0.9931973 1.0
mean_per_class_error 0.0013605 0.0030423 0.0 0.0 0.0 0.0068027 0.0
mse 0.0023244 0.0046636 0.0000000 0.0009893 0.0000012 0.0106316 0.0000000
pr_auc nan 0.0 nan nan nan nan nan
r2 0.9950131 0.0100327 0.9999999 0.9979306 0.9999974 0.9771377 0.9999999
rmse 0.0271903 0.0445129 0.0001243 0.0314527 0.0010872 0.1031098 0.0001775
Scoring History:
timestamp duration number_of_trees training_rmse training_logloss training_classification_error training_auc training_pr_auc
2023-09-01 22:21:02 20.766 sec 0.0 0.6666667 1.0986123 0.6042553 nan nan
2023-09-01 22:21:02 20.787 sec 5.0 0.5258189 0.7480497 0.0531915 nan nan
2023-09-01 22:21:02 20.798 sec 10.0 0.4122763 0.5274643 0.0276596 nan nan
2023-09-01 22:21:02 20.811 sec 15.0 0.3309825 0.3890578 0.0106383 nan nan
2023-09-01 22:21:02 20.824 sec 20.0 0.2470086 0.2634297 0.0021277 nan nan
2023-09-01 22:21:02 20.837 sec 25.0 0.1836921 0.1798077 0.0021277 nan nan
2023-09-01 22:21:02 20.858 sec 30.0 0.1401519 0.1277112 0.0021277 nan nan
2023-09-01 22:21:02 20.871 sec 35.0 0.0972901 0.0804222 0.0021277 nan nan
2023-09-01 22:21:02 20.886 sec 40.0 0.0739156 0.0548825 0.0021277 nan nan
2023-09-01 22:21:02 20.899 sec 45.0 0.0613434 0.0402589 0.0021277 nan nan
--- --- --- --- --- --- --- --- ---
2023-09-01 22:21:02 21.402 sec 200.0 0.0008427 0.0001134 0.0 nan nan
2023-09-01 22:21:02 21.424 sec 205.0 0.0006936 0.0000941 0.0 nan nan
2023-09-01 22:21:02 21.440 sec 210.0 0.0005894 0.0000806 0.0 nan nan
2023-09-01 22:21:02 21.456 sec 215.0 0.0005252 0.0000673 0.0 nan nan
2023-09-01 22:21:02 21.473 sec 220.0 0.0004346 0.0000579 0.0 nan nan
2023-09-01 22:21:02 21.489 sec 225.0 0.0003819 0.0000497 0.0 nan nan
2023-09-01 22:21:02 21.512 sec 230.0 0.0003234 0.0000421 0.0 nan nan
2023-09-01 22:21:02 21.530 sec 235.0 0.0002731 0.0000366 0.0 nan nan
2023-09-01 22:21:02 21.548 sec 240.0 0.0002362 0.0000309 0.0 nan nan
2023-09-01 22:21:02 21.556 sec 241.0 0.0002324 0.0000299 0.0 nan nan
[50 rows x 9 columns]
Variable Importances:
variable relative_importance scaled_importance percentage
PRODUCT_CODE_1 1197.7524414 1.0 0.9473339
LOCATION 45.8291359 0.0382626 0.0362475
RIDER_COUNT 7.3652973 0.0061493 0.0058254
MARITAL_STATUS 5.5946555 0.0046710 0.0044250
PAYMENT_METHOD 2.5465114 0.0021261 0.0020141
ANNUAL_INCOME 2.3943691 0.0019991 0.0018938
ANNUALIZED_PREMIUM_1 2.0634596 0.0017228 0.0016320
TOTAL_COVERAGE 0.6242671 0.0005212 0.0004937
CHANNEL 0.1702214 0.0001421 0.0001346
GENDER 0.0000000 0.0000000 0.0000000
NATIONALITY 0.0 0.0 0.0

[tips]
Use `model.explain()` to inspect the model.
--
Use `h2o.display.toggle_user_tips()` to switch on/off this section.

Model 2 Predict¶

Dibawah merupakan proses dan hasil prediksi dari model kedua. Hasil testing dari model kami menghasilkan hasil yang sempurna dimana model dapat merekomendasi produk yang sesuai dengan pembelian produk ke 2 untuk client-client yang terdapat pada data testing.

In [66]:
pred1=model1.predict(test_frame)
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
In [67]:
prob1 = pred1[['P03','P07','P08']].as_data_frame()
In [68]:
prob1['Predicted'] = prob1[['P03', 'P07', 'P08']].idxmax(axis=1)
prob1
Out[68]:
P03 P07 P08 Predicted
0 5.470399e-08 9.999999e-01 6.483585e-11 P07
1 1.352110e-07 9.999999e-01 8.857160e-11 P07
2 3.426468e-05 9.999657e-01 6.659987e-09 P07
3 1.370444e-07 9.999999e-01 5.290091e-11 P07
4 1.267810e-07 9.999999e-01 6.119444e-11 P07
... ... ... ... ...
272 9.999892e-01 4.636463e-06 6.206751e-06 P03
273 5.682118e-07 9.999994e-01 1.985268e-10 P07
274 9.999710e-01 2.758238e-05 1.428226e-06 P03
275 4.651537e-08 1.000000e+00 6.414990e-11 P07
276 8.826274e-06 1.072256e-09 9.999912e-01 P08

277 rows × 4 columns

In [69]:
test['PRODUCT_CODE_PREDICT'] = prob1['Predicted'].tolist()
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\2545193329.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['PRODUCT_CODE_PREDICT'] = prob1['Predicted'].tolist()
In [70]:
test['Predicted'] = test['PRODUCT_CODE_2'] == test['PRODUCT_CODE_PREDICT']
print(test['Predicted'].sum() / len(test['Predicted'])) 
0.9963898916967509
C:\Users\micha\AppData\Local\Temp\ipykernel_16844\4065189623.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted'] = test['PRODUCT_CODE_2'] == test['PRODUCT_CODE_PREDICT']